{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "data": {
      "text/plain": "     aisle_id                       aisle\n0           1       prepared soups salads\n1           2           specialty cheeses\n2           3         energy granola bars\n3           4               instant foods\n4           5  marinades meat preparation\n..        ...                         ...\n129       130    hot cereal pancake mixes\n130       131                   dry pasta\n131       132                      beauty\n132       133  muscles joints pain relief\n133       134  specialty wines champagnes\n\n[134 rows x 2 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>aisle_id</th>\n      <th>aisle</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>prepared soups salads</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>specialty cheeses</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>energy granola bars</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>4</td>\n      <td>instant foods</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>5</td>\n      <td>marinades meat preparation</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>129</th>\n      <td>130</td>\n      <td>hot cereal pancake mixes</td>\n    </tr>\n    <tr>\n      <th>130</th>\n      <td>131</td>\n      <td>dry pasta</td>\n    </tr>\n    <tr>\n      <th>131</th>\n      <td>132</td>\n      <td>beauty</td>\n    </tr>\n    <tr>\n      <th>132</th>\n      <td>133</td>\n      <td>muscles joints pain relief</td>\n    </tr>\n    <tr>\n      <th>133</th>\n      <td>134</td>\n      <td>specialty wines champagnes</td>\n    </tr>\n  </tbody>\n</table>\n<p>134 rows × 2 columns</p>\n</div>"
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "order_products  = pd.read_csv(\"C:\\\\Users\\\\Administrator\\\\Desktop\\data\\\\order_products__prior.csv\")\n",
    "products  = pd.read_csv(\"C:\\\\Users\\\\Administrator\\\\Desktop\\data\\\\products.csv\")\n",
    "orders  = pd.read_csv(\"C:\\\\Users\\\\Administrator\\\\Desktop\\data\\\\orders.csv\")\n",
    "aisles  = pd.read_csv(\"C:\\\\Users\\\\Administrator\\\\Desktop\\data\\\\aisles.csv\")\n",
    "aisles"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "data": {
      "text/plain": "          order_id  user_id eval_set  order_number  order_dow  \\\n0          2539329        1    prior             1          2   \n1          2398795        1    prior             2          3   \n2           473747        1    prior             3          3   \n3          2254736        1    prior             4          4   \n4           431534        1    prior             5          4   \n...            ...      ...      ...           ...        ...   \n32434484   2231925   204495    prior            51          1   \n32434485    327001   204495    prior            53          2   \n32434486   1997103   110030    prior             4          2   \n32434487   1362143   113181    prior            33          3   \n32434488    777464   179210    prior             7          5   \n\n          order_hour_of_day  days_since_prior_order  product_id  \\\n0                         8                     NaN         196   \n1                         7                    15.0         196   \n2                        12                    21.0         196   \n3                         7                    29.0         196   \n4                        15                    28.0         196   \n...                     ...                     ...         ...   \n32434484                 15                     9.0        2642   \n32434485                  8                     7.0        2642   \n32434486                 16                     5.0       24189   \n32434487                 17                     5.0       24189   \n32434488                 15                    20.0       24189   \n\n          add_to_cart_order  reordered  \\\n0                         1          0   \n1                         1          1   \n2                         1          1   \n3                         1          1   \n4                         1          1   \n...                     ...        ...   \n32434484                  8          1   \n32434485                  1          1   \n32434486                  8          0   \n32434487                 12          0   \n32434488                 16          0   \n\n                                               product_name  aisle_id  \\\n0                                                      Soda        77   \n1                                                      Soda        77   \n2                                                      Soda        77   \n3                                                      Soda        77   \n4                                                      Soda        77   \n...                                                     ...       ...   \n32434484  Frozen Concentrated Orange Juice With Added Ca...       113   \n32434485  Frozen Concentrated Orange Juice With Added Ca...       113   \n32434486   Tropical Fruit Smoothie Tasty American Favorites       113   \n32434487   Tropical Fruit Smoothie Tasty American Favorites       113   \n32434488   Tropical Fruit Smoothie Tasty American Favorites       113   \n\n          department_id         aisle  \n0                     7   soft drinks  \n1                     7   soft drinks  \n2                     7   soft drinks  \n3                     7   soft drinks  \n4                     7   soft drinks  \n...                 ...           ...  \n32434484              1  frozen juice  \n32434485              1  frozen juice  \n32434486              1  frozen juice  \n32434487              1  frozen juice  \n32434488              1  frozen juice  \n\n[32434489 rows x 14 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>order_id</th>\n      <th>user_id</th>\n      <th>eval_set</th>\n      <th>order_number</th>\n      <th>order_dow</th>\n      <th>order_hour_of_day</th>\n      <th>days_since_prior_order</th>\n      <th>product_id</th>\n      <th>add_to_cart_order</th>\n      <th>reordered</th>\n      <th>product_name</th>\n      <th>aisle_id</th>\n      <th>department_id</th>\n      <th>aisle</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2539329</td>\n      <td>1</td>\n      <td>prior</td>\n      <td>1</td>\n      <td>2</td>\n      <td>8</td>\n      <td>NaN</td>\n      <td>196</td>\n      <td>1</td>\n      <td>0</td>\n      <td>Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2398795</td>\n      <td>1</td>\n      <td>prior</td>\n      <td>2</td>\n      <td>3</td>\n      <td>7</td>\n      <td>15.0</td>\n      <td>196</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>473747</td>\n      <td>1</td>\n      <td>prior</td>\n      <td>3</td>\n      <td>3</td>\n      <td>12</td>\n      <td>21.0</td>\n      <td>196</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2254736</td>\n      <td>1</td>\n      <td>prior</td>\n      <td>4</td>\n      <td>4</td>\n      <td>7</td>\n      <td>29.0</td>\n      <td>196</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>431534</td>\n      <td>1</td>\n      <td>prior</td>\n      <td>5</td>\n      <td>4</td>\n      <td>15</td>\n      <td>28.0</td>\n      <td>196</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>32434484</th>\n      <td>2231925</td>\n      <td>204495</td>\n      <td>prior</td>\n      <td>51</td>\n      <td>1</td>\n      <td>15</td>\n      <td>9.0</td>\n      <td>2642</td>\n      <td>8</td>\n      <td>1</td>\n      <td>Frozen Concentrated Orange Juice With Added Ca...</td>\n      <td>113</td>\n      <td>1</td>\n      <td>frozen juice</td>\n    </tr>\n    <tr>\n      <th>32434485</th>\n      <td>327001</td>\n      <td>204495</td>\n      <td>prior</td>\n      <td>53</td>\n      <td>2</td>\n      <td>8</td>\n      <td>7.0</td>\n      <td>2642</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Frozen Concentrated Orange Juice With Added Ca...</td>\n      <td>113</td>\n      <td>1</td>\n      <td>frozen juice</td>\n    </tr>\n    <tr>\n      <th>32434486</th>\n      <td>1997103</td>\n      <td>110030</td>\n      <td>prior</td>\n      <td>4</td>\n      <td>2</td>\n      <td>16</td>\n      <td>5.0</td>\n      <td>24189</td>\n      <td>8</td>\n      <td>0</td>\n      <td>Tropical Fruit Smoothie Tasty American Favorites</td>\n      <td>113</td>\n      <td>1</td>\n      <td>frozen juice</td>\n    </tr>\n    <tr>\n      <th>32434487</th>\n      <td>1362143</td>\n      <td>113181</td>\n      <td>prior</td>\n      <td>33</td>\n      <td>3</td>\n      <td>17</td>\n      <td>5.0</td>\n      <td>24189</td>\n      <td>12</td>\n      <td>0</td>\n      <td>Tropical Fruit Smoothie Tasty American Favorites</td>\n      <td>113</td>\n      <td>1</td>\n      <td>frozen juice</td>\n    </tr>\n    <tr>\n      <th>32434488</th>\n      <td>777464</td>\n      <td>179210</td>\n      <td>prior</td>\n      <td>7</td>\n      <td>5</td>\n      <td>15</td>\n      <td>20.0</td>\n      <td>24189</td>\n      <td>16</td>\n      <td>0</td>\n      <td>Tropical Fruit Smoothie Tasty American Favorites</td>\n      <td>113</td>\n      <td>1</td>\n      <td>frozen juice</td>\n    </tr>\n  </tbody>\n</table>\n<p>32434489 rows × 14 columns</p>\n</div>"
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "tab1 = pd.merge(orders,order_products,on=[\"order_id\",\"order_id\"])\n",
    "tab2 = pd.merge(tab1,products,on=[\"product_id\",\"product_id\"])\n",
    "tab3 = pd.merge(tab2,aisles,on=[\"aisle_id\",\"aisle_id\"])\n",
    "tab3"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [
    {
     "data": {
      "text/plain": "        order_id  user_id eval_set  order_number  order_dow  \\\n200000    398155   134837    prior            30          3   \n200001   3306177   134837    prior            31          4   \n200002   1025645   134837    prior            32          4   \n200003   2802216   134837    prior            33          2   \n200004   2091511   134837    prior            34          3   \n...          ...      ...      ...           ...        ...   \n219995    663658    99439    prior            39          3   \n219996   2561342    99439    prior            40          4   \n219997   1360237    99439    prior            41          6   \n219998   2339386    99439    prior            42          3   \n219999   3034988    99439    prior            43          1   \n\n        order_hour_of_day  days_since_prior_order  product_id  \\\n200000                 11                     3.0       12140   \n200001                 16                    15.0       12140   \n200002                 12                     7.0       12140   \n200003                 18                     5.0       12140   \n200004                 16                     8.0       12140   \n...                   ...                     ...         ...   \n219995                 12                     7.0       18564   \n219996                 11                     8.0       18564   \n219997                 13                     9.0       18564   \n219998                 15                    11.0       18564   \n219999                 10                    12.0       18564   \n\n        add_to_cart_order  reordered                      product_name  \\\n200000                  1          1                         Diet Soda   \n200001                  1          1                         Diet Soda   \n200002                  1          1                         Diet Soda   \n200003                  1          1                         Diet Soda   \n200004                  1          1                         Diet Soda   \n...                   ...        ...                               ...   \n219995                  2          1  Diet Ginger Ale All Natural Soda   \n219996                  1          1  Diet Ginger Ale All Natural Soda   \n219997                  4          1  Diet Ginger Ale All Natural Soda   \n219998                  1          1  Diet Ginger Ale All Natural Soda   \n219999                  3          1  Diet Ginger Ale All Natural Soda   \n\n        aisle_id  department_id        aisle  \n200000        77              7  soft drinks  \n200001        77              7  soft drinks  \n200002        77              7  soft drinks  \n200003        77              7  soft drinks  \n200004        77              7  soft drinks  \n...          ...            ...          ...  \n219995        77              7  soft drinks  \n219996        77              7  soft drinks  \n219997        77              7  soft drinks  \n219998        77              7  soft drinks  \n219999        77              7  soft drinks  \n\n[20000 rows x 14 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>order_id</th>\n      <th>user_id</th>\n      <th>eval_set</th>\n      <th>order_number</th>\n      <th>order_dow</th>\n      <th>order_hour_of_day</th>\n      <th>days_since_prior_order</th>\n      <th>product_id</th>\n      <th>add_to_cart_order</th>\n      <th>reordered</th>\n      <th>product_name</th>\n      <th>aisle_id</th>\n      <th>department_id</th>\n      <th>aisle</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>200000</th>\n      <td>398155</td>\n      <td>134837</td>\n      <td>prior</td>\n      <td>30</td>\n      <td>3</td>\n      <td>11</td>\n      <td>3.0</td>\n      <td>12140</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Diet Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>200001</th>\n      <td>3306177</td>\n      <td>134837</td>\n      <td>prior</td>\n      <td>31</td>\n      <td>4</td>\n      <td>16</td>\n      <td>15.0</td>\n      <td>12140</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Diet Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>200002</th>\n      <td>1025645</td>\n      <td>134837</td>\n      <td>prior</td>\n      <td>32</td>\n      <td>4</td>\n      <td>12</td>\n      <td>7.0</td>\n      <td>12140</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Diet Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>200003</th>\n      <td>2802216</td>\n      <td>134837</td>\n      <td>prior</td>\n      <td>33</td>\n      <td>2</td>\n      <td>18</td>\n      <td>5.0</td>\n      <td>12140</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Diet Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>200004</th>\n      <td>2091511</td>\n      <td>134837</td>\n      <td>prior</td>\n      <td>34</td>\n      <td>3</td>\n      <td>16</td>\n      <td>8.0</td>\n      <td>12140</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Diet Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>219995</th>\n      <td>663658</td>\n      <td>99439</td>\n      <td>prior</td>\n      <td>39</td>\n      <td>3</td>\n      <td>12</td>\n      <td>7.0</td>\n      <td>18564</td>\n      <td>2</td>\n      <td>1</td>\n      <td>Diet Ginger Ale All Natural Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>219996</th>\n      <td>2561342</td>\n      <td>99439</td>\n      <td>prior</td>\n      <td>40</td>\n      <td>4</td>\n      <td>11</td>\n      <td>8.0</td>\n      <td>18564</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Diet Ginger Ale All Natural Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>219997</th>\n      <td>1360237</td>\n      <td>99439</td>\n      <td>prior</td>\n      <td>41</td>\n      <td>6</td>\n      <td>13</td>\n      <td>9.0</td>\n      <td>18564</td>\n      <td>4</td>\n      <td>1</td>\n      <td>Diet Ginger Ale All Natural Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>219998</th>\n      <td>2339386</td>\n      <td>99439</td>\n      <td>prior</td>\n      <td>42</td>\n      <td>3</td>\n      <td>15</td>\n      <td>11.0</td>\n      <td>18564</td>\n      <td>1</td>\n      <td>1</td>\n      <td>Diet Ginger Ale All Natural Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n    <tr>\n      <th>219999</th>\n      <td>3034988</td>\n      <td>99439</td>\n      <td>prior</td>\n      <td>43</td>\n      <td>1</td>\n      <td>10</td>\n      <td>12.0</td>\n      <td>18564</td>\n      <td>3</td>\n      <td>1</td>\n      <td>Diet Ginger Ale All Natural Soda</td>\n      <td>77</td>\n      <td>7</td>\n      <td>soft drinks</td>\n    </tr>\n  </tbody>\n</table>\n<p>20000 rows × 14 columns</p>\n</div>"
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table = pd.crosstab(tab3[\"user_id\"],tab3[\"aisle\"])\n",
    "table"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [
    {
     "data": {
      "text/plain": "aisle    soft drinks\nuser_id             \n125                1\n127               18\n133                3\n140                1\n142                3\n...              ...\n206006             1\n206029             1\n206155             1\n206174             2\n206206             2\n\n[5970 rows x 1 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th>aisle</th>\n      <th>soft drinks</th>\n    </tr>\n    <tr>\n      <th>user_id</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>125</th>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>127</th>\n      <td>18</td>\n    </tr>\n    <tr>\n      <th>133</th>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>140</th>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>142</th>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>206006</th>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>206029</th>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>206155</th>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>206174</th>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>206206</th>\n      <td>2</td>\n    </tr>\n  </tbody>\n</table>\n<p>5970 rows × 1 columns</p>\n</div>"
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "transfer = PCA(n_components=0.95)\n",
    "result =  transfer.fit_transform(table)\n",
    "print(result)\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "#kmean分组"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}